Find SNPs in all MR genes
library(tidyverse)
get list of genes
genes <- read_csv("../output/MR_UN_graphs_node_annotation_2012.csv")
Parsed with column specification:
cols(
.default = col_double(),
.id = [31mcol_character()[39m,
trt = [31mcol_character()[39m,
name = [31mcol_character()[39m,
chrom = [31mcol_character()[39m,
subject = [31mcol_character()[39m,
AGI = [31mcol_character()[39m,
At_symbol = [31mcol_character()[39m,
At_description = [31mcol_character()[39m
)
See spec(...) for full column specifications.
genes
output BED file for use by samtools
genes %>%
select(chrom, start, end, name) %>%
mutate(start = start - 1) %>% # first base is base 0 in BED system.
# but end base is not included, so leave that alone
write_tsv("../output/MR_genes.bed", col_names = FALSE)
On whitney… Only use the UN files to keep things faster
for f in $(ls bamsnew/mapping_v1.5_merged_bam_files/*UN*bam)
do
f=$(basename $f)
echo $f
samtools view -b bamsnew/mapping_v1.5_merged_bam_files/$f -L MR_genes.bed > subsetbams/$f
samtools index subsetbams/$f
done
now run freebayes in parallel. First split up the bed file into chunks
split MR_genes.bed -n "l/12" MR_gene.bedx
parallel -j 12 freebayes --targets {} -f ~/Sequences/ref_genomes/B_rapa/genome/V1.5/Brapa_sequence_v1.5.fa subsetbams/*.bam ::: *.bedx* > MRgenes.vcf
snpeff
java -jar snpEff/snpEff.jar Brassica_rapa MRgenes.vcf > MRgenes_ann.vcf
now bring to R and process
vcf.header <- system("grep '#C' ../output/MRGenes_ann.vcf",intern = TRUE) %>%
str_replace("#","") %>% str_split(pattern = "\t") %>%
magrittr::extract2(1)
vcf.header
[1] "CHROM" "POS" "ID" "REF" "ALT"
[6] "QUAL" "FILTER" "INFO" "FORMAT" "unknown"
vcf.data <- read_tsv("../output/MRGenes_ann.vcf",na = c("","NA","."),comment="#",col_names = vcf.header) %>%
select(-FILTER) %>%
mutate(ID=str_c(CHROM,"_",POS))
Parsed with column specification:
cols(
CHROM = [31mcol_character()[39m,
POS = [32mcol_double()[39m,
ID = [33mcol_logical()[39m,
REF = [31mcol_character()[39m,
ALT = [31mcol_character()[39m,
QUAL = [32mcol_double()[39m,
FILTER = [33mcol_logical()[39m,
INFO = [31mcol_character()[39m,
FORMAT = [31mcol_character()[39m,
unknown = [31mcol_character()[39m
)
vcf.data
split the genotype record:
vcf.data <- vcf.data %>%
separate(unknown, into=c("GT", "DP", "AD", "RO", "QR", "AO", "QA", "GL"), convert = TRUE, sep=":")
vcf.data
Filter. Because all of the RILs are considered together, we want to keep SNPs that are heterozygous (meaning segregating)
vcf.data <- vcf.data %>% filter(!(GT %in% c("0/0", "1/1")))
vcf.data
filter by depth. We had a large number of samples go into this (like 400). Filter to require a depth of at least 400
vcf.data <- vcf.data %>% filter(DP > 400)
vcf.data
scanning through this, qualities and depth all look good so will not do further filtering.
now pull out the snp info…
first, pull out the full annotation ino
vcf.data <- vcf.data %>%
mutate(ANN=str_extract(INFO,"ANN.*$"),
ANN=str_remove(ANN, "ANN="))
head(vcf.data$ANN)
[1] "G|missense_variant|MODERATE|Bra037847|Bra037847|transcript|Bra037847|protein_coding|1/2|c.83C>G|p.Thr28Ser|83/711|83/711|28/236||,G|downstream_gene_variant|MODIFIER|Bra037846|Bra037846|transcript|Bra037846|protein_coding||c.*4418C>G|||||4418|WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS"
[2] "G|synonymous_variant|LOW|Bra037847|Bra037847|transcript|Bra037847|protein_coding|2/2|c.234A>G|p.Pro78Pro|234/711|234/711|78/236||,G|downstream_gene_variant|MODIFIER|Bra037846|Bra037846|transcript|Bra037846|protein_coding||c.*4663A>G|||||4663|WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS"
[3] "A|synonymous_variant|LOW|Bra037847|Bra037847|transcript|Bra037847|protein_coding|2/2|c.249C>A|p.Ile83Ile|249/711|249/711|83/236||,A|downstream_gene_variant|MODIFIER|Bra037846|Bra037846|transcript|Bra037846|protein_coding||c.*4678C>A|||||4678|WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS"
[4] "T|synonymous_variant|LOW|Bra037847|Bra037847|transcript|Bra037847|protein_coding|2/2|c.256C>T|p.Leu86Leu|256/711|256/711|86/236||,T|downstream_gene_variant|MODIFIER|Bra037846|Bra037846|transcript|Bra037846|protein_coding||c.*4685C>T|||||4685|WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS"
[5] "A|missense_variant|MODERATE|Bra037847|Bra037847|transcript|Bra037847|protein_coding|2/2|c.346C>A|p.His116Asn|346/711|346/711|116/236||,A|upstream_gene_variant|MODIFIER|Bra037848|Bra037848|transcript|Bra037848|protein_coding||c.-4966C>A|||||4966|,A|downstream_gene_variant|MODIFIER|Bra037846|Bra037846|transcript|Bra037846|protein_coding||c.*4775C>A|||||4775|WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS"
[6] "T|synonymous_variant|LOW|Bra008566|Bra008566|transcript|Bra008566|protein_coding|20/20|c.2634T>A|p.Leu878Leu|2634/2691|2634/2691|878/896||,T|upstream_gene_variant|MODIFIER|Bra008565|Bra008565|transcript|Bra008565|protein_coding||c.-265T>A|||||265|,T|downstream_gene_variant|MODIFIER|Bra008563|Bra008563|transcript|Bra008563|protein_coding||c.*3844A>T|||||3844|,T|downstream_gene_variant|MODIFIER|Bra008564|Bra008564|transcript|Bra008564|protein_coding||c.*881A>T|||||881|"
the tricky part is that each SNP may have a different number of annotations, so I need to create a list-column
vcf.data.filter <- vcf.data %>%
filter(Annotation_Impact != "LOW",
Annotation_Impact != "MODIFIER") %>%
arrange(CHROM, POS)
vcf.data.filter
vcf.data.filter <- vcf.data.filter %>%
mutate(IGV=str_c(CHROM, ":", POS, "-", POS)) %>%
select(CHROM, POS, ID, IGV, everything())
vcf.data.filter
write_csv(vcf.data.filter, "MR_gene_Annotated_Filtered_SNPs.csv")